library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.2.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following object is masked from 'package:stringr':
##
## boundary
## The following object is masked from 'package:tidyr':
##
## fill
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library(ggthemes)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(radiant.data)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
##
## Attaching package: 'radiant.data'
## The following objects are masked from 'package:lubridate':
##
## month, wday
## The following object is masked from 'package:forcats':
##
## as_factor
## The following objects are masked from 'package:purrr':
##
## is_double, is_empty, is_numeric
## The following object is masked from 'package:ggplot2':
##
## diamonds
library(corrplot)
## corrplot 0.84 loaded
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Root directory of the AVA dataset; every input file below is read relative to it.
DATASET.PATH <- "../data/ava_downloader/AVA_dataset"
# Load train.csv from the dataset directory.
train_df <- glue("{DATASET.PATH}/train.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Load test.csv from the dataset directory.
test_df <- glue("{DATASET.PATH}/test.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Overlap check between the two splits: rows whose image.id occurs in both
# train.csv and test.csv. Printed so the (ideally empty) result is visible.
anti <- inner_join(train_df, test_df, by = "image.id")
anti
# Same check on the raw id vectors.
train_df$image.id %>%
  intersect(test_df$image.id)
## numeric(0)
# Plot the first `n` images referenced by `df` in a grid, filled row by row.
#
# Args:
#   n:    Number of images to show. Clamped to the number of rows in `df`;
#         nothing is drawn when this leaves fewer than one image.
#   df:   Data frame with an `image.id` column; rows are shown in the given order.
#   ncol: Number of grid columns (default 4, the previously fixed layout).
#
# Each image is read from "{DATASET.PATH}/images/{image.id}.jpg".
# Returns NULL invisibly; called for its plotting side effect.
show.images <- function(n, df, ncol = 4) {
  n <- min(n, nrow(df))
  if (n < 1) {
    return(invisible(NULL))
  }

  # Pad the grid with 0 (an empty cell in layout()) so `n` need not be a
  # multiple of `ncol`; otherwise matrix() recycles figure numbers.
  n.rows <- ceiling(n / ncol)
  cells <- c(seq_len(n), rep(0, n.rows * ncol - n))
  layout(matrix(cells, ncol = ncol, byrow = TRUE))
  # Go back to a single-figure layout afterwards so that later plots on the
  # same device are not squeezed into one grid cell.
  on.exit(layout(1), add = TRUE)

  for (i in seq_len(n)) {
    image.id <- df$image.id[i]
    plot(load.image(glue("{DATASET.PATH}/images/{image.id}.jpg")))
  }
  invisible(NULL)
}
# Per-file image attributes. The numeric image id is recovered from the file
# name by dropping its last four characters (presumably the ".jpg" suffix --
# confirm against the CSV), and `resolution` is the pixel count width * height.
files <- glue("{DATASET.PATH}/image_attributes.csv") %>%
  read_csv() %>%
  mutate(
    image.id = as.integer(str_sub(filename, 0, -5)),
    resolution = width * height
  ) %>%
  select(-filename)
## Parsed with column specification:
## cols(
## aspect_ratio = col_double(),
## depth = col_double(),
## file_size = col_double(),
## filename = col_character(),
## height = col_double(),
## img_size = col_double(),
## width = col_double()
## )
# Read the space-delimited, header-less AVA.txt. read_delim() already returns
# a tibble, so no as.tibble() conversion (deprecated in newer tibble releases)
# is needed. Columns get readr's default names X1, X2, ...
ava.image.dataset <- read_delim(
  glue("{DATASET.PATH}/AVA.txt"),
  delim = " ",
  col_names = FALSE
)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_double(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_double(),
## X11 = col_double(),
## X12 = col_double(),
## X13 = col_double(),
## X14 = col_double(),
## X15 = col_double()
## )
# Name the 15 columns: a row index, the image id, one count column per score
# 1..10 (named "1".."10"), two semantic tag ids and the challenge id.
colnames(ava.image.dataset) <- c(
  "index", "image.id",
  "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  "semantic.tag.id1", "semantic.tag.id2", "challenge.id"
)

# The row index carries no information; image.id becomes an integer key.
ava.image.dataset <- ava.image.dataset %>%
  mutate(image.id = as.integer(image.id)) %>%
  select(-index)

# Add per-image rating statistics: the score columns are reshaped to long
# form (score, count) and the count is used as the weight of each score.
ava.image.dataset <- ava.image.dataset %>%
  inner_join(
    ava.image.dataset %>%
      select(-semantic.tag.id1, -semantic.tag.id2, -challenge.id) %>%
      gather(-image.id, key = "rating", value = "number") %>%
      group_by(image.id) %>%
      summarise(
        rating.mean = weighted.mean(as.numeric(rating), number),
        rating.sd = weighted.sd(as.numeric(rating), number)
      ) %>%
      mutate(
        # Buckets are [1,2], (2,3], ..., (9,10]. include.lowest = TRUE keeps a
        # mean of exactly 1 in bucket 1 instead of turning it into NA.
        rating.mean.bucket = cut(
          rating.mean,
          breaks = 1:10,
          labels = 1:9,
          include.lowest = TRUE
        )
      ),
    by = "image.id"
  )
There are 255508 files, but 255530 images in dataset.
Data entries without an image file were deleted.
The dataset has 255530 rows and 17 columns.
Columns: image.id, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, semantic.tag.id1, semantic.tag.id2, challenge.id, rating.mean, rating.sd, rating.mean.bucket
Sample rows
# Show the first rows (head() default) when sorted by descending image.id.
head(arrange(ava.image.dataset, desc(image.id)))
# Attach the per-file attributes. The key is spelled out so the join cannot
# silently change if both tables ever share another column name. As an inner
# join, it keeps only images present in both tables.
ava.image.dataset <- ava.image.dataset %>%
  inner_join(files, by = "image.id")
# The attribute table is no longer needed once merged.
rm(files)
# Histogram of the total number of ratings per image.
# Bare numbers inside select() are column POSITIONS, not names, so the score
# columns must be back-quoted; selecting positions 1..10 picked image.id plus
# "1".."9" and left the "10" counts out of the total.
plot.dist.num.ratings <- ava.image.dataset %>%
  select(image.id, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`) %>%
  gather(-image.id, key = "rating", value = "number") %>%
  group_by(image.id) %>%
  summarize(number = sum(number)) %>%
  ggplot() +
  geom_histogram(aes(number), fill = "steelblue", bins = 50)
# Histogram of the per-image mean rating (70 bins).
plot.dist.rating.mean <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = rating.mean), bins = 70, fill = "steelblue")
# Bar chart of the number of images per mean-rating bucket, with a horizontal
# reference line drawn at 1000 images.
plot.bar.rating.mean.bucket <- ava.image.dataset %>%
  count(rating.mean.bucket) %>%
  ggplot() +
  geom_col(mapping = aes(x = rating.mean.bucket, y = n), fill = "steelblue") +
  scale_y_continuous(labels = comma) +
  geom_hline(yintercept = 1000)
# One 30-bin histogram per image attribute, plus one for the rating spread.

# Aspect ratio, with x-axis ticks every 0.25 between 0 and 5.
plot.dist.aspect.ratio <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = aspect_ratio), bins = 30, fill = "steelblue") +
  scale_x_continuous(breaks = seq(0, 5, 0.25))

# Resolution (width * height).
plot.dist.resolution <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = resolution), bins = 30, fill = "steelblue")

# File size.
plot.dist.file.size <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = file_size), bins = 30, fill = "steelblue")

# Image size as reported in the attributes file.
plot.dist.img.size <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = img_size), bins = 30, fill = "steelblue")

# Width.
plot.dist.width <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = width), bins = 30, fill = "steelblue")

# Height.
plot.dist.height <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = height), bins = 30, fill = "steelblue")

# Depth.
plot.dist.depth <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = depth), bins = 30, fill = "steelblue")

# Per-image standard deviation of the ratings.
plot.dist.rating.sd <- ggplot(ava.image.dataset) +
  geom_histogram(mapping = aes(x = rating.sd), bins = 30, fill = "steelblue")
# Rating-related distributions on a two-column grid.
grid.arrange(
  grobs = list(
    plot.dist.rating.mean,
    plot.dist.rating.sd,
    plot.dist.num.ratings,
    plot.bar.rating.mean.bucket
  ),
  ncol = 2
)
# Image-attribute distributions on a three-column grid.
grid.arrange(
  grobs = list(
    plot.dist.aspect.ratio,
    plot.dist.file.size,
    plot.dist.img.size,
    plot.dist.width,
    plot.dist.height,
    plot.dist.depth,
    plot.dist.resolution
  ),
  ncol = 3
)
# The eight images with the highest mean rating ...
ava.image.dataset %>%
  arrange(desc(rating.mean)) %>%
  show.images(8, .)
# ... and the eight with the lowest.
ava.image.dataset %>%
  arrange(rating.mean) %>%
  show.images(8, .)
# Pairwise correlations between the rating statistics and the image
# attributes, drawn as pie glyphs.
ava.image.dataset %>%
  select(
    rating.mean, rating.sd, file_size, width, height,
    img_size, aspect_ratio, resolution
  ) %>%
  cor() %>%
  corrplot(method = "pie")
# Scatter plots of the mean rating (x) against individual image attributes (y);
# points are semi-transparent.
corr.rating.mean.file.size <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = file_size)) +
  geom_point(alpha = 0.4, color = "steelblue")

corr.rating.mean.img.size <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = img_size)) +
  geom_point(alpha = 0.4, color = "steelblue")

corr.rating.mean.aspect.ratio <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = aspect_ratio)) +
  geom_point(alpha = 0.4, color = "steelblue")

corr.rating.mean.resolution <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = resolution)) +
  geom_point(alpha = 0.4, color = "steelblue")

# All four scatter plots on a two-column grid.
grid.arrange(
  corr.rating.mean.file.size,
  corr.rating.mean.img.size,
  corr.rating.mean.aspect.ratio,
  corr.rating.mean.resolution,
  ncol = 2
)